#!/usr/bin/env python3
#
#  Process the distribution of journal subjects covered by each journal
#  
#  Copyright 2025 Eko Didik Widianto <didik@live.undip.ac.id>
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program. If not, see <https://www.gnu.org/licenses/>.
#

import sys
from pathlib import Path    # Path info

import pandas as pd     # Pandas

# Plotting with seaborn (on top of matplotlib)
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np


# Path definitions: every input/output location is derived from the folder
# that contains this script, so the script can be launched from any working
# directory.
path_abs = Path(__file__).parent.absolute() # Use absolute path
sys.path.append(f"{path_abs}/..") # Adds higher directory to python modules path
in_path = f"{path_abs}/data_master/"    # Master data (JSON) read by main()
report_path = f'{path_abs}/reporting/'  # Destination prefix for the profile report
result_path = f'{path_abs}/results/'    # Destination prefix for the per-journal table

# Own functions. This import must stay below the sys.path.append() call above.
# NOTE(review): presumably the `util` package is found through that extra
# path entry - confirm against the repository layout.
from util.utils import save_file, open_json_file, save_aprofile

# Data sources, one entry per source: [display name, master journal file stem].
# main() reads `<in_path><file stem>.json` and uses the display name in the
# plot title.
data_file = [['GARUDA', 'gjournal'],
  ['SINTA', 'sjournal']]

def main():
  """Profile and plot the number of subject areas per active journal.

  For each ``[name, file stem]`` entry in ``data_file`` the master journal
  data is loaded from ``in_path`` and reduced to the rows whose ``is_active``
  flag is set; the number of entries in ``subj_areas`` is then counted for
  every journal. Per source the function

  * writes the per-journal table with ``save_file``,
  * collects summary figures (journal totals, distribution of the subject
    counts, mean / median / IQR over journals with at least one subject),
  * draws a horizontal bar chart of the distribution with a box plot
    overlaid on the same axes.

  All collected profiles are written with ``save_aprofile`` and the figure
  is shown interactively at the end.
  """
  print("The number of subjects of journals")
  print("================")

  jprofiles = []
  cols = ['id', 'title', 'publisher', 'pissn', 'eissn', 'subj_areas']

  # The figure was laid out for subject counts 0..10, i.e. 11 bar rows.
  # More rows are added only when a journal lists more than 10 subjects.
  min_rows = 11

  # Plots: one panel per data source, sharing the y axis (subject count)
  fig, ax = plt.subplots(ncols=2, figsize=(8, 5), sharey=True, gridspec_kw={'wspace': 0})
  plt.subplots_adjust(left=0.1, right=0.9,
    top=0.9, bottom=0.2,
    wspace=0.1, hspace=0.1)

  color_idx = ["#fdae61", "#4575b4"]
  ax[0].set_ylabel('Number of subjects per journal', fontsize=11)

  for i, mdata in enumerate(data_file):
    src, jour_src = mdata

    jprofile = {}
    data_subjs = {}

    jdf = open_json_file(f'{in_path}{jour_src}.json')
    jprofile['number of journals'] = len(jdf.index)

    # Filter active journals. The explicit copy makes the column assignments
    # below act on an independent frame instead of a slice of the loaded
    # data (avoids pandas' SettingWithCopyWarning).
    jdf = jdf[jdf['is_active']][cols].copy()
    # Missing subject lists become empty lists so that their length is 0
    jdf['subj_areas'] = jdf['subj_areas'].fillna('').apply(list)
    jdf['nsubj_areas'] = jdf['subj_areas'].str.len()

    save_file(jdf, f'{result_path}/01-{jour_src}_jour-subj', 'Journal subjects')

    # Profiling
    jprofile['filtered journals'] = len(jdf.index)
    jprofile['filtered journals%'] = round(100 * jprofile['filtered journals'] / jprofile['number of journals'])
    jprofile['not_journal'] = jprofile['number of journals'] - jprofile['filtered journals']
    jprofile['src'] = jour_src

    # Distribution of the subject count: absolute and relative frequency,
    # both ordered by the subject count itself
    jdf_count = jdf['nsubj_areas'].value_counts().sort_index()
    jdf_countp = jdf['nsubj_areas'].value_counts(normalize=True).sort_index()

    # Key every entry by the real subject count, not by its position, so a
    # count that never occurs cannot shift the labels of the following ones.
    # int() turns the numpy index value into a plain Python int key.
    jprofile['subj_count'] = {
      int(nsubj): f"{cnt:,d} ({100 * pct:.2f} %)"
      for nsubj, cnt, pct in zip(jdf_count.index, jdf_count, jdf_countp)
    }

    # Statistics over journals that list at least one subject
    jdf_ = jdf[jdf['nsubj_areas'] > 0].reset_index(drop=True)
    jprofile['nsubj_mean'] = round(jdf_['nsubj_areas'].mean(), 2)

    jq75, jq50, jq25 = np.percentile(jdf_['nsubj_areas'], [75, 50, 25])

    jprofile['nsubj_median'] = jq50
    jprofile['nsubj_iqr'] = jq75 - jq25

    jprofiles.append(jprofile)

    # Prepare data for plot: one row per subject count 0..n-1. Counts that
    # do not occur get an explicit zero row, so row label == subject count
    # (draw_barplot looks the values up by row number) and the frame can be
    # built regardless of how many distinct counts the data contains.
    max_seen = int(jdf_count.index.max()) if len(jdf_count) else 0
    row_idx = range(max(min_rows, max_seen + 1))
    data_subjs[jour_src] = jdf_count.reindex(row_idx, fill_value=0).tolist()
    data_subjs[jour_src+'p'] = (100 * jdf_countp).round(2).reindex(row_idx, fill_value=0.0).tolist()
    subj_df = pd.DataFrame(data_subjs, index=list(row_idx))

    print(i, subj_df)

    # The first panel grows to the left, the second one to the right
    draw_barplot(subj_df, jour_src, ax[i], title = f"{src}\nNj={len(jdf.index):,d}",
      color = color_idx[i], xreverse=(i == 0))

    # Add box plot
    draw_boxplot(jdf, 'nsubj_areas', ax[i])

    ax[i].invert_yaxis()

  fig.supxlabel('Number of journals (%)', fontsize=11, y = 0.08)  # Common x label

  save_aprofile(jprofiles, report_path + "01-jour_subjs", src="records", desc="01. Number of journal by subject count")
  plt.tight_layout()
  plt.show()

def draw_barplot(df, col, ax, title, color, xreverse=False):
  """Draw a horizontal bar chart of percentages, labelled with raw counts.

  Parameters:
    df       -- frame holding the count column `col` and the percentage
                column `col + 'p'`; its index supplies the bar categories
                and is also used to look up the label of each bar by row
                number
    col      -- name of the count column
    ax       -- matplotlib Axes to draw on
    title    -- panel title
    color    -- bar colour
    xreverse -- when True the x axis is inverted so the bars grow to the
                left; otherwise the left spine is hidden
  """
  for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

  # A blank (non-empty) label is set before plotting; presumably this keeps
  # seaborn from writing the column name as the x label - keep this order.
  ax.set_xlabel(' ')
  ax.set_title(title, fontsize=10, y = 0.92)
  ax.margins(x=0.3)

  sns.barplot(data=df, x = f'{col}p', y = df.index, ax=ax,
    dodge = True, color = color, width = 0.85, orient = 'horizontal')

  if xreverse:
    ax.invert_xaxis()   # Bars grow from right to left
    align = 'right'
  else:
    ax.spines['left'].set_visible(False)
    align = 'left'

  # Write the absolute count next to the end of each bar
  for row, bar in enumerate(ax.patches):
    label = f'{int(df.at[row, col]):,d}'
    text_x = bar.get_width() + 1   # One x unit (percent) beyond the bar end
    text_y = bar.get_y() + bar.get_height() / 4
    ax.text(text_x, text_y, label, ha=align)
    
def draw_boxplot(df, col, ax):
  """Overlay a box plot of `col` on `ax`, using only rows where `col` > 0.

  The box is drawn on a twin Axes created with `ax.twiny()`, which shares
  the y axis of `ax`, so the box lines up with whatever `ax` shows on its
  y axis. The mean is marked with a black dot and the median with a red
  line.

  Parameters:
    df  -- frame containing the numeric column `col`
    col -- name of the column to summarise
    ax  -- matplotlib Axes the box plot is overlaid on
  """
  overlay = ax.twiny()

  # Rows with a zero value are left out of the box statistics
  nonzero = df[df[col] > 0].sort_values([col])

  mean_style = {"marker":"o",
    "markerfacecolor":"black",
    "markeredgecolor":"black",
    "markersize":"5"}
  median_style = {'color': 'red', 'linewidth': 1.5}

  # NOTE: whiskers use seaborn's default; whis=(0, 100) was tried and is
  # currently disabled.
  sns.boxplot(data = nonzero, y = col, ax = overlay,
    gap = 0.1, width = 0.3, linewidth = 0.8,
    color = 'black', fill = False,
    showmeans=True, meanprops=mean_style,
    medianprops=median_style)

  # Customize boxplot: hide the ticks and the frame of the twin axes
  overlay.tick_params(labelleft=False, left=False, labelright=False, right = False)
  for side in ('left', 'top', 'right'):
    overlay.spines[side].set_visible(False)
  # Fixed x range of the twin axes; the box (width 0.3) covers only a
  # small part of it
  overlay.set(xlim=(-2, 2))

# Run the analysis only when this file is executed as a script; importing it
# as a module does not start the processing or open the plot window.
if __name__ == '__main__':
  main()
